# This script calculates cosine similarities with the "opposition minus support" vector.
# It uses the embeddings from the largest version (by sample size).
# It does this for each of the top 10 newspapers in each country (Egypt and Tunisia).
# The results are needed for the synthetic DiD analyses.
library(quanteda)
library(conText)
library(dplyr)
library(text2vec)
library(stringr)
library(tibble)
source("utils.R")

# Fix the random number generator state for this run.
set.seed(123L)

# Embeddings and the matching transformation matrix from the biggest combined
# version; both are handed to process_cos_sim() further down.
local_glove <- readRDS("data/embedding_combined/combined_local_glove150000030k.rds")
local_transform <- readRDS("data/embedding_combined/combined_local_transform150000030k.rds")

# Single Arabic words defining the "opposition minus support" contrast:
# the first word is the opposition pole, the second the support pole.
first_ar <- "المعارضة"
second_ar <- "الدعم"

# Country identifiers to process (two in this example).
countries <- c("masress", "turess")

# Compute cosine similarities for one newspaper of one country.
#
# Instead of reading a corpus and tokenizing it, the pre-saved tokens object
# for the country (which carries docvars) is loaded and filtered by newspaper.
#
# Arguments:
#   country_name     Country identifier; used to build the tokens file name.
#   newspaper_name   Value of the 'newspaper' docvar whose documents are kept.
#   local_glove      Embeddings, passed on as `pre_trained`.
#   local_transform  Transformation matrix, passed on as `transform_matrix`.
#
# Returns the output of get_similarity_scores2() with `newspaper` and
# `country` columns appended. The two pole words are taken from the
# script-level objects `first_ar` and `second_ar`.
# Stops with an error if the tokens object has no 'newspaper' docvar.
process_cos_sim <- function(country_name, newspaper_name, local_glove, local_transform) {

  # Load the full pre-saved tokens object for this country
  toks_file <- paste0("data/analysis_toks/", country_name, "_target_toks_leader.rds")
  target_toks <- readRDS(toks_file)

  # Test for the exact column name: `$` on the docvars data frame would also
  # accept a partially matching name (e.g. a longer name starting the same way).
  if (!"newspaper" %in% names(docvars(target_toks))) {
    stop("The tokens object does not have a 'newspaper' docvar. Please add it during token generation.")
  }

  # Keep only the documents of the requested newspaper. which() drops NA
  # comparisons, so documents with a missing newspaper are simply excluded
  # instead of putting NA into the subsetting index.
  keep_docs <- which(docvars(target_toks)[["newspaper"]] == newspaper_name)
  toks_subset <- target_toks[keep_docs]

  # get_similarity_scores2() expects a file path, so the filtered tokens are
  # written to a temporary file. The file is removed again when the function
  # exits (normally or through an error) so repeated calls do not pile up
  # large .rds files in the session's temp directory.
  temp_file <- tempfile(fileext = ".rds")
  on.exit(unlink(temp_file), add = TRUE)
  saveRDS(toks_subset, temp_file)

  cat("Getting cos_sims for country", country_name, "and newspaper", newspaper_name, "\n")

  # Compute cosine similarity on the tokens subset (instead of a corpus)
  cos_simsdf_all <- get_similarity_scores2(
    target_toks_file = temp_file,
    target = "TARGETWORD",
    first_vec = first_ar,
    second_vec = second_ar,
    pre_trained = local_glove,
    transform_matrix = local_transform,
    group_var = "yearwk",
    window = 12L,
    norm = "l2"
  )

  # Append metadata identifying the source of these scores
  cos_simsdf_all$newspaper <- newspaper_name
  cos_simsdf_all$country <- country_name

  cos_simsdf_all
}

# Read the pre-saved tokens of each country to determine its top newspapers.
# The tokens are expected to carry a 'newspaper' docvar.
masress_toks <- readRDS("data/analysis_toks/masress_target_toks_leader.rds")
turess_toks <- readRDS("data/analysis_toks/turess_target_toks_leader.rds")

# Count the docvars rows per newspaper, most frequent first. Rows with a
# missing newspaper are dropped beforehand so that NA can never be picked as
# a "top" newspaper (an NA name cannot be matched by an equality filter).
masress_sources <- as_tibble(docvars(masress_toks)) %>%
  dplyr::filter(!is.na(newspaper)) %>%
  count(newspaper, sort = TRUE)

turess_sources <- as_tibble(docvars(turess_toks)) %>%
  dplyr::filter(!is.na(newspaper)) %>%
  count(newspaper, sort = TRUE)

# Names of the (up to) 10 most frequent newspapers per country, as plain
# character vectors even if the docvar is stored as a factor.
top10_masress <- as.character(head(masress_sources$newspaper, 10))
top10_turess <- as.character(head(turess_sources$newspaper, 10))

# Main loop: process each country and its top 10 newspapers.
# Lookup table from country identifier to its top newspapers.
top_by_country <- list(masress = top10_masress, turess = top10_turess)

# One slot per country, filled in below.
results_list <- vector("list", length(countries))
names(results_list) <- countries

for (country in countries) {

  # Fail loudly for a country without a newspaper list instead of silently
  # reusing the list of the previous iteration.
  top_newspapers <- top_by_country[[country]]
  if (is.null(top_newspapers)) {
    stop("No top-newspaper list defined for country '", country, "'.")
  }

  # Cosine similarities for each (country, newspaper) pair. Names are passed
  # as character strings, matching what a for loop over the vector would give.
  all_newspapers_results <- lapply(as.character(top_newspapers), function(newspaper) {
    process_cos_sim(country, newspaper, local_glove, local_transform)
  })

  # Combine the results for all newspapers in the country. No `.id` column is
  # requested: process_cos_sim() already attaches a `newspaper` column, and an
  # id column of the same name would only collide with it.
  combined_results <- bind_rows(all_newspapers_results)
  results_list[[country]] <- combined_results
}

# Combine the results of all countries into one data frame.
final_results <- bind_rows(results_list)

# Create the output directory if it does not exist yet and save the results.
# The directory is written without a trailing slash and joined to the file
# name with file.path(), so the resulting path contains a single separator
# and the existence check does not depend on how a trailing slash is handled.
dir_name <- "data/output/cos_sims_mastun"
if (!dir.exists(dir_name)) {
  dir.create(dir_name, recursive = TRUE)
}

saveRDS(final_results, file.path(dir_name, "cos_simsdf_all_bysource_tokens.rds"))
